Chiahui Chen (Abby), Kwun Wah Michael Chiu, Yiqing Huang, Yan (Ava) Zhang
Since 2008, guests and hosts have used Airbnb to expand on traveling possibilities and present a more unique, personalized way of experiencing the world. This dataset describes the listing activity and metrics in Manhattan, NY for October 2020, containing 20K+ listings.
Data Source: http://insideairbnb.com/get-the-data.html
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%autosave 0
Autosave disabled
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
import sklearn.tree as tree
from IPython.display import Image
import pydotplus
%pylab inline
Populating the interactive namespace from numpy and matplotlib
#Initial data sets
listing=pd.read_csv("listings_2.csv")
review = pd.read_csv('reviews_2.csv')
listing.columns
Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
'host_name', 'host_since', 'host_location', 'host_about',
'host_response_time', 'host_response_rate', 'host_acceptance_rate',
'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
'host_neighbourhood', 'host_listings_count',
'host_total_listings_count', 'host_verifications',
'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
'maximum_minimum_nights', 'minimum_maximum_nights',
'maximum_maximum_nights', 'minimum_nights_avg_ntm',
'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
'availability_30', 'availability_60', 'availability_90',
'availability_365', 'calendar_last_scraped', 'number_of_reviews',
'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
'last_review', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'license', 'instant_bookable',
'calculated_host_listings_count',
'calculated_host_listings_count_entire_homes',
'calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
dtype='object')
The review data set has:
#sentiment analysis
review1 = review.merge(listing[['id','neighbourhood_group_cleansed', 'neighbourhood_cleansed','property_type']] ,\
left_on='listing_id', right_on='id')
rm = review1[review1.neighbourhood_group_cleansed=='Manhattan']
rm.drop(['id_x','date','id_y','reviewer_id','reviewer_name'],axis=1,inplace=True)
rm.comments.fillna(value='',inplace=True)
review1.comments.fillna(value='',inplace=True)
rm.isna().any()
listing_id False comments False neighbourhood_group_cleansed False neighbourhood_cleansed False property_type False dtype: bool
The listing data set has:
The columns are:
neighbourhood_group_cleansed : Borough
Variables related to Host profile:
host_id, host_url,host_name, host_since, host_location, host_about,host_response_time, host_response_rate, host_acceptance_rate, host_is_superhost, host_thumbnail_url, host_picture_url,host_neighbourhood, host_listings_count, host_total_listings_count, host_verifications, host_has_profile_pic, host_identity_verified, neighbourhood, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_private_rooms, calculated_host_listings_count_shared_rooms, reviews_per_month
Variables related to listing profile:
latitude,longitude, property_type, room_type, accommodates, bathrooms, bathrooms_text, bedrooms, beds, amenities, price, minimum_nights, maximum_nights
Variables related to review rating:
number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, first_review,last_review, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, instant_bookable
#price prediction
listing_nyc=listing[listing.neighbourhood_group_cleansed=="Manhattan"]
listing_nyc.price.dtypes
dtype('O')
listing_nyc["price"]=listing_nyc["price"].str.replace("[\$\.\,]","").astype(int)
listing_nyc["price"]=listing_nyc["price"]/100.00
listing_nyc.isna().sum()[listing_nyc.isna().sum()>1]
name 9 description 446 neighborhood_overview 8092 host_name 4 host_since 4 host_location 62 host_about 8271 host_response_time 10442 host_response_rate 10442 host_acceptance_rate 7730 host_is_superhost 4 host_thumbnail_url 4 host_picture_url 4 host_neighbourhood 4064 host_listings_count 4 host_total_listings_count 4 host_has_profile_pic 4 host_identity_verified 4 neighbourhood 8092 bathrooms 19741 bathrooms_text 51 bedrooms 2718 beds 203 minimum_minimum_nights 6 maximum_minimum_nights 6 minimum_maximum_nights 6 maximum_maximum_nights 6 minimum_nights_avg_ntm 6 maximum_nights_avg_ntm 6 calendar_updated 19741 first_review 5279 last_review 5279 review_scores_rating 5760 review_scores_accuracy 5773 review_scores_cleanliness 5766 review_scores_checkin 5774 review_scores_communication 5768 review_scores_location 5778 review_scores_value 5776 license 19741 reviews_per_month 5279 dtype: int64
#superhost
listing_nyc["host_is_superhost"].fillna("f",inplace=True)
listing_nyc['host_is_superhost1'] = listing_nyc['host_is_superhost']
listing_nyc["host_is_superhost1"].replace(["t", "f"], [1, 0], inplace = True)
# Convert the variables of superhost indicator to better presenting variables
listing_nyc['host_is_superhost'].replace('t','SuperHost',inplace = True)
listing_nyc['host_is_superhost'].replace('f','RegularHost',inplace =True)
# Convert host response rate to numerical variables
listing_nyc["host_response_rate"] = listing_nyc["host_response_rate"].str.replace("%", "")
listing_nyc["host_response_rate"] = listing_nyc["host_response_rate"].astype(float) / 100
listing_nyc["host_response_rate"].fillna(0,inplace=True)
# Convert host acceptance rate to numerical variables
listing_nyc["host_acceptance_rate"] = listing_nyc["host_acceptance_rate"].str.replace("%", "")
listing_nyc["host_acceptance_rate"] = listing_nyc["host_acceptance_rate"].astype(float) / 100
listing_nyc["host_acceptance_rate"].fillna(0,inplace=True)
listing_nyc["host_response_time"].fillna("No responsing",inplace=True)
listing_nyc["host_has_profile_pic"].fillna("Missing",inplace=True)
listing_nyc["host_identity_verified"].fillna("Missing",inplace=True)
listing_nyc.has_availability.replace(["t", "f"], [1, 0], inplace = True)
listing_nyc.instant_bookable.replace(["t", "f"], [1, 0], inplace = True)
listing_nyc["bathrooms_text"]=listing_nyc.bathrooms_text.str.lower()
listing_nyc.bathrooms_text.unique()
array(['1 bath', '1 shared bath', '1 private bath', nan,
'shared half-bath', '2 baths', '1.5 baths', 'half-bath',
'0 shared baths', '1.5 shared baths', '2.5 baths', '3 baths',
'private half-bath', '0 baths', '2 shared baths',
'2.5 shared baths', '5 baths', '3.5 baths', '4 baths', '5.5 baths',
'8 shared baths', '4.5 baths', '3 shared baths',
'3.5 shared baths', '4 shared baths', '6.5 baths',
'4.5 shared baths', '7.5 baths', '6 baths', '8 baths'],
dtype=object)
listing_nyc["bathrooms_text"].replace(to_replace=["half-bath","private half-bath"],value="0.5 bath",inplace=True)
listing_nyc["bath_shared"]=listing_nyc.bathrooms_text.str.contains("shared")*1.0
listing_nyc[listing_nyc["bath_shared"].isna() ]["room_type"].unique()
array(['Private room', 'Shared room', 'Entire home/apt', 'Hotel room'],
dtype=object)
listing_nyc.loc[(listing_nyc["bath_shared"].isna())&(listing_nyc["room_type"]=='Shared room'),"bath_shared"]=1.0
listing_nyc.loc[(listing_nyc["bath_shared"].isna()),"bath_shared"]=0.0
listing_nyc["bath_count"]=listing_nyc.bathrooms_text.str.extract('(\d+\.?\d*)', expand=False).astype(float)
listing_nyc["bath_count"].fillna(1,inplace=True)
listing_nyc.drop(columns="bathrooms_text",inplace=True)
listing_nyc["bedrooms"].fillna(1,inplace=True)
listing_nyc["beds"].fillna(1,inplace=True)
listing_nyc["host_listings_count"].fillna(1,inplace=True)
listing_nyc["host_total_listings_count"].fillna(1,inplace=True)
#choose 40 percentile of the score for Nan
rev_rating=listing_nyc.review_scores_rating.quantile(0.4)
listing_nyc.review_scores_rating.fillna(rev_rating,inplace=True)
rev_accuracy=listing_nyc.review_scores_accuracy.quantile(0.4)
listing_nyc.review_scores_accuracy.fillna(rev_accuracy,inplace=True)
rev_clean=listing_nyc.review_scores_cleanliness.quantile(0.4)
listing_nyc.review_scores_cleanliness.fillna(rev_clean,inplace=True)
rev_checkin=listing_nyc.review_scores_checkin.quantile(0.4)
listing_nyc.review_scores_checkin.fillna(rev_checkin,inplace=True)
rev_communi=listing_nyc.review_scores_communication.quantile(0.4)
listing_nyc.review_scores_communication.fillna(rev_communi,inplace=True)
rev_location=listing_nyc.review_scores_location.quantile(0.4)
listing_nyc.review_scores_location.fillna(rev_location,inplace=True)
rev_value=listing_nyc.review_scores_value.quantile(0.4)
listing_nyc.review_scores_value.fillna(rev_value,inplace=True)
listing_nyc.reviews_per_month.fillna(0,inplace=True)
listing_nyc.property_type.unique()
array(['Entire apartment', 'Private room in apartment',
'Entire condominium', 'Private room in condominium', 'Entire loft',
'Private room in loft', 'Entire townhouse', 'Entire house',
'Private room in bed and breakfast', 'Shared room in apartment',
'Private room in house', 'Private room in guest suite',
'Private room in townhouse', 'Tiny house',
'Private room in hostel', 'Entire place', 'Entire guest suite',
'Entire serviced apartment', 'Private room in guesthouse',
'Room in serviced apartment', 'Room in hostel',
'Room in aparthotel', 'Shared room in condominium', 'Entire villa',
'Entire resort', 'Private room', 'Shared room', 'Room in hotel',
'Room in boutique hotel', 'Shared room in house',
'Private room in resort', 'Entire guesthouse', 'Entire floor',
'Shared room in loft', 'Shared room in townhouse',
'Entire timeshare', 'Room in resort', 'Entire home/apt',
'Private room in tent', 'Shared room in guest suite',
'Private room in serviced apartment',
'Shared room in bed and breakfast', 'Private room in barn',
'Room in bed and breakfast', 'Camper/RV', 'Shared room in island',
'Shared room in tent', 'Shared room in boutique hotel',
'Shared room in guesthouse', 'Private room in camper/rv',
'Private room in island', 'Shared room in hostel',
'Private room in tiny house'], dtype=object)
listing_nyc["property_type"]=listing_nyc.property_type.apply(lambda x:x.split(" ")[-1:][0].lower())
property_dict={"apartment":["apartment","loft","place","suite","guesthouse",'room',"home/apt",'floor'],
"house":["condominium","townhouse","house","villa","timeshare"],
"hotel":["aparthotel","resort","hotel"],
"hostel":["breakfast","hostel"],
"other":["tent","barn","camper/rv","island"]}
def get_key(val, dictionary, default=None):
    """Reverse lookup: return the first key whose value list contains *val*.

    Parameters
    ----------
    val : hashable
        Item to search for inside each value list.
    dictionary : dict
        Maps keys to lists (e.g. property-category -> list of property types).
    default : optional
        Returned when *val* is found in no value list (defaults to None,
        matching the previous implicit behavior).
    """
    for key, values in dictionary.items():
        if val in values:
            return key
    return default
%%time
listing_nyc["property_type_cat"]=listing_nyc["property_type"].apply(lambda x:get_key(x,property_dict))
CPU times: user 8.02 ms, sys: 224 µs, total: 8.25 ms Wall time: 8.21 ms
listing_nyc["property_type_cat"].isna().sum()
0
listing_nyc.drop(columns="property_type",inplace=True)
type(listing_nyc.amenities[0])
listing_nyc.amenities[0]
str
'["Hot water", "Stove", "Extra pillows and blankets", "Smoke alarm", "Laptop-friendly workspace", "Coffee maker", "Bed linens", "Dishes and silverware", "Oven", "Wifi", "Air conditioning", "Paid parking off premises", "Baking sheet", "Fire extinguisher", "Free street parking", "Essentials", "Heating", "Kitchen", "Carbon monoxide alarm", "Cooking basics", "Refrigerator", "Hair dryer", "Hangers", "Keypad", "Bathtub", "Ethernet connection", "TV", "Iron", "Cleaning before checkout", "Long term stays allowed", "Luggage dropoff allowed"]'
def amenities_count(s):
    """Return the number of amenities in a listing's amenities field.

    *s* is a JSON-encoded array of strings (see the sample value printed
    above).  Parsing with ``json.loads`` handles amenities whose names
    contain commas and correctly returns 0 for an empty array — the old
    ``strip``/``split`` approach returned 1 for ``"[]"`` because
    ``''.split(', ')`` yields ``['']``.  Falls back to the delimiter-based
    parse if the string is not valid JSON.
    """
    import json
    try:
        return len(json.loads(s))
    except (TypeError, ValueError):
        cleaned = s.strip("][").replace('"', '')
        return len(cleaned.split(', ')) if cleaned else 0
listing_nyc["amenities_count"]=listing_nyc.amenities.apply(lambda x:amenities_count(x))
listing_nyc.drop(columns="amenities",inplace=True)
listing_Mah=listing_nyc.copy()
drop_list=["availability_30","availability_365","availability_60","availability_90",\
"bathrooms","license","calendar_last_scraped","calendar_updated","first_review",\
"has_availability","host_about","host_acceptance_rate","host_has_profile_pic","host_identity_verified",\
"host_is_superhost","host_location","host_name","host_neighbourhood","host_picture_url",\
"host_response_rate","host_response_time","host_thumbnail_url","host_url","instant_bookable",\
"last_review","last_scraped","listing_url","name","neighborhood_overview","neighbourhood",\
"neighbourhood_group_cleansed","picture_url","scrape_id",\
"minimum_minimum_nights","maximum_minimum_nights","minimum_maximum_nights",\
"maximum_maximum_nights","minimum_nights_avg_ntm","maximum_nights_avg_ntm",\
"description","host_since","host_verifications"]
listing_Mah.drop(columns=drop_list,inplace=True)
listing_Mah.isna().sum().sum()
0
df2= listing_nyc[['host_response_rate', 'host_acceptance_rate',\
'host_is_superhost', 'host_is_superhost1','host_listings_count', 'host_total_listings_count',\
'host_has_profile_pic', 'host_identity_verified', 'accommodates', 'bedrooms', 'beds',\
'price', 'minimum_nights', 'maximum_nights', 'has_availability', 'availability_30',\
'availability_60', 'availability_90', 'availability_365',\
'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',\
'review_scores_rating', 'review_scores_accuracy',\
'review_scores_cleanliness', 'review_scores_checkin',\
'review_scores_communication', 'review_scores_location',\
'review_scores_value', 'instant_bookable',\
'calculated_host_listings_count',\
'calculated_host_listings_count_entire_homes',\
'calculated_host_listings_count_private_rooms',\
'calculated_host_listings_count_shared_rooms', 'reviews_per_month',\
'host_response_time',\
'amenities_count','latitude', 'longitude','neighbourhood_cleansed',"neighbourhood_group_cleansed","property_type_cat"]]
df2.isna().sum().sum()
0
#Install NLTK: run pip install --user -U nltk
#Install Numpy (optional): run pip install --user -U numpy
#Test installation: run python then type import nltk
def num_of_words(df):
    """Add a 'word_count' column to *df* in place.

    The count is the number of tokens obtained by splitting each comment on
    a single space (so consecutive spaces produce empty tokens, exactly as
    before); NaN comments are stringified first.  Prints the head of the
    comments/word_count pair for a quick sanity check.
    """
    counts = df['comments'].map(lambda comment: len(str(comment).split(" ")))
    df['word_count'] = counts
    print(df[['comments', 'word_count']].head())
num_of_words(rm)
comments word_count 0 Notre séjour de trois nuits.\r\nNous avons app... 120 1 Great experience. 2 2 I've stayed with my friend at the Midtown Cast... 89 3 We've been staying here for about 9 nights, en... 66 4 We had a wonderful stay at Jennifer's charming... 24
rm.word_count.sum()
19243008
def CleanString(string):
    """Lower-case *string* and strip punctuation/noise for sentiment analysis.

    Punctuation characters are replaced by spaces in one pass, tokens are
    lower-cased, and standalone 'rt' (retweet) tokens are dropped.

    Bug fix: the previous chain of ``str.replace`` calls included
    ``.replace('rt', ' ')``, which deleted 'rt' *inside* words — e.g.
    "apartment" became "apa ment" (visible in the cleaned_comments output
    above).  'rt' is now removed only as a whole word.
    """
    # One str.translate pass instead of eleven chained .replace calls.
    punct_to_space = str.maketrans({c: ' ' for c in '@‘./!#:\\,-&'})
    tokens = string.translate(punct_to_space).split()
    return ' '.join(w.lower() for w in tokens if w.lower() != 'rt')
%%time
rm['cleaned_comments'] = rm.comments.apply(CleanString)
CPU times: user 6.45 s, sys: 114 ms, total: 6.56 s Wall time: 6.63 s
#Stop Words Removal
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
def stop_words_removal(df):
    """Remove English stopwords from *df*['cleaned_comments'] in place.

    Relies on the module-level ``stop`` list (nltk English stopwords).
    Prints the head of the cleaned column afterwards for inspection.
    """
    def drop_stopwords(text):
        return " ".join(word for word in text.split() if word not in stop)

    df['cleaned_comments'] = df['cleaned_comments'].apply(drop_stopwords)
    print(df['cleaned_comments'].head())
%%time
stop_words_removal(rm)
0 notre séjour de trois nuits nous avons appréci... 1 great experience 2 i've stayed friend midtown castle six days lov... 3 we've staying 9 nights enjoying center city ne... 4 wonderful stay jennifer's charming apa ment or... Name: cleaned_comments, dtype: object CPU times: user 32.3 s, sys: 178 ms, total: 32.5 s Wall time: 32.7 s
rm.head()
| listing_id | comments | neighbourhood_group_cleansed | neighbourhood_cleansed | property_type | word_count | cleaned_comments | |
|---|---|---|---|---|---|---|---|
| 0 | 2595 | Notre séjour de trois nuits.\r\nNous avons app... | Manhattan | Midtown | Entire apartment | 120 | notre séjour de trois nuits nous avons appréci... |
| 1 | 2595 | Great experience. | Manhattan | Midtown | Entire apartment | 2 | great experience |
| 2 | 2595 | I've stayed with my friend at the Midtown Cast... | Manhattan | Midtown | Entire apartment | 89 | i've stayed friend midtown castle six days lov... |
| 3 | 2595 | We've been staying here for about 9 nights, en... | Manhattan | Midtown | Entire apartment | 66 | we've staying 9 nights enjoying center city ne... |
| 4 | 2595 | We had a wonderful stay at Jennifer's charming... | Manhattan | Midtown | Entire apartment | 24 | wonderful stay jennifer's charming apa ment or... |
def FindPositive(comments, positive):
    """Count tokens of *comments* (whitespace-split) found in *positive*.

    *positive* should be a set for O(1) membership tests; duplicates in the
    comment are counted once per occurrence.
    """
    return sum(1 for token in comments.split() if token in positive)
#set
positive=[]
f=open ('positive.txt','r')
for w in f:
w = w.lower()
w = w.strip()
positive.append(w)
positive_set=set(positive)
len(positive_set)
#positive
f.close()
2006
%%time
#get positive
rm['p_count'] = rm.cleaned_comments.apply(FindPositive, positive=positive_set)
#rm2 = rm [:10000]
#rm2['p_count'] = rm2.cleaned_comments.apply(FindPositive, positive=positive)
CPU times: user 1.73 s, sys: 17.1 ms, total: 1.75 s Wall time: 1.76 s
def FindNegative(comments, negative):
    """Count tokens of *comments* (whitespace-split) found in *negative*.

    Mirror of FindPositive; *negative* should be a set for fast lookup.
    """
    return sum(1 for token in comments.split() if token in negative)
negative=[]
f=open ('negative.txt','r')
for w in f:
w = w.lower()
w = w.strip()
negative.append(w)
negative_set=set(negative)
len(negative_set)
#negative
f.close()
4783
%%time
#get negative
rm['n_count'] = rm.cleaned_comments.apply(FindNegative, negative=negative_set)
#rm2 = rm [:10000]
#rm2['n_count'] = rm2.cleaned_comments.apply(FindNegative, negative=negative)
CPU times: user 1.77 s, sys: 14.5 ms, total: 1.79 s Wall time: 1.8 s
#conda install -c conda-forge textblob
from textblob import TextBlob
%%time
rm['senti_score'] = rm.cleaned_comments.apply(lambda x: TextBlob(x).sentiment.polarity)
rm.columns
CPU times: user 1min 34s, sys: 434 ms, total: 1min 34s Wall time: 1min 35s
Index(['listing_id', 'comments', 'neighbourhood_group_cleansed',
'neighbourhood_cleansed', 'property_type', 'word_count',
'cleaned_comments', 'p_count', 'n_count', 'senti_score'],
dtype='object')
#Manhattan review vibes
p_ratio = (rm.p_count.sum() /rm.word_count.sum())
p_ratio
n_ratio = (rm.n_count.sum() /rm.word_count.sum())
n_ratio
avg_score = rm.senti_score.mean()
avg_score
0.08724280528283312
0.00872457154307684
0.3774352861903678
vibes=pd.DataFrame({'p_ratio':[rm.p_count.sum() /rm.word_count.sum()], \
'n_ratio':[rm.n_count.sum() /rm.word_count.sum()]})
vibes
| p_ratio | n_ratio | |
|---|---|---|
| 0 | 0.087243 | 0.008725 |
sns.catplot(data=vibes,kind="bar",height=3)
<seaborn.axisgrid.FacetGrid at 0x7fd5fedb06a0>
gb = rm.groupby('neighbourhood_cleansed').senti_score.mean()
gb_frame = gb.to_frame().reset_index().sort_values('senti_score', ascending=False)
gb_frame.head()
| neighbourhood_cleansed | senti_score | |
|---|---|---|
| 26 | Tribeca | 0.426927 |
| 6 | Financial District | 0.424213 |
| 0 | Battery Park City | 0.422929 |
| 23 | SoHo | 0.420894 |
| 24 | Stuyvesant Town | 0.417126 |
sns.catplot(x='senti_score', y='neighbourhood_cleansed',\
data=rm, kind='bar', aspect=1, height=15, order=gb_frame.neighbourhood_cleansed.values)
<seaborn.axisgrid.FacetGrid at 0x7fd5d310a8e0>
gb = rm.groupby('neighbourhood_cleansed').senti_score.mean()
gb_frame = gb.to_frame().reset_index().sort_values('senti_score', ascending=False)
gb_frame.head()
sns.catplot(x='senti_score', y='neighbourhood_cleansed',\
data=rm, kind='bar', aspect=1, height=15, order=gb_frame.neighbourhood_cleansed.values)
| neighbourhood_cleansed | senti_score | |
|---|---|---|
| 26 | Tribeca | 0.426927 |
| 6 | Financial District | 0.424213 |
| 0 | Battery Park City | 0.422929 |
| 23 | SoHo | 0.420894 |
| 24 | Stuyvesant Town | 0.417126 |
<seaborn.axisgrid.FacetGrid at 0x7fd5d3c056a0>
import plotly.express as px
import json
#import geojson
with open('neighbourhoods.geojson') as f:
gj = json.load(f)
fig = px.choropleth_mapbox(gb_frame, geojson=gj, color="senti_score",
locations="neighbourhood_cleansed", featureidkey="properties.neighbourhood",
center={"lat": 40.78, "lon": -73.97},
mapbox_style="carto-positron", zoom=10, hover_name='neighbourhood_cleansed',
hover_data={'neighbourhood_cleansed': False, 'senti_score':True})
fig.show()
pt = rm.groupby('property_type').senti_score.mean()
pt_frame = pt.to_frame().reset_index().sort_values('senti_score', ascending=False)
pt_frame.head()
| property_type | senti_score | |
|---|---|---|
| 47 | Shared room in tent | 0.562500 |
| 45 | Shared room in island | 0.487004 |
| 29 | Room in aparthotel | 0.462939 |
| 49 | Tiny house | 0.456067 |
| 5 | Entire home/apt | 0.454846 |
sns.catplot(x='senti_score', y='property_type',\
data=rm, kind='bar', aspect=1, height=15, order=pt_frame.property_type.values)
<seaborn.axisgrid.FacetGrid at 0x7fd5d4f4c7f0>
Findings: Overall there is a Positive vibe from the listings at Manhattan.
Managerial insights: This data is very insightful and can help Airbnb hosts improve their customers’ experience. For example, a shared room in a tent has the most positive results among all types, which shows people enjoy novel experiences nowadays. East Harlem has the lowest average sentiment score; hosts in that area could work on their services to boost their positive reviews.
We are going to explore the pattern of house prices based on house features and outer data sources such as the location of NYC transit and popular attractions to predict price by leveraging regression models.
listing_Mah.price.describe()
count 19741.000000 mean 194.142496 std 432.154775 min 0.000000 25% 81.000000 50% 128.000000 75% 200.000000 max 10000.000000 Name: price, dtype: float64
fig, axes = plt.subplots(1,2, figsize=(21,6))
colors = ["#FF5A5F","#00A699","#767676"]
sns.set_palette(sns.color_palette(colors))
sns.distplot(listing_Mah['price'], ax=axes[0])
sns.distplot(np.log1p(listing_Mah['price']), ax=axes[1])
axes[1].set_xlabel('log(1+price)');
listing_Mah["log_price"]=np.log1p(listing_Mah['price'])
In this part, we decide to use log_price as our dependent variable in the ML process since the distribution of price is extremely skewed.
cor=listing_Mah.corr()
mask = np.triu(np.ones_like(cor, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(cor, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5});
Using List_of_Neighborhood.csv to assign the Neighborhood with area label. (Downtown, Islands,Midtown, Midtown and Downtown,Uptown)
Data Source:https://en.wikipedia.org/wiki/List_of_Manhattan_neighborhoods
This is a list of neighborhoods in the New York City borough of Manhattan arranged geographically from the north of the island to the south.
NYC_neigh_list=pd.read_csv("List_of_Neighborhood.csv",index_col="Name of the Neighborhood")
NYC_neigh_list.head()
| List of Manhattan neighborhoods | Definition | |
|---|---|---|
| Name of the Neighborhood | ||
| Upper Manhattan | Uptown | above 95th St |
| Marble Hill | Uptown | physically located on the mainland |
| Inwood | Uptown | above Dyckman St |
| Fort George | Uptown | Between Nagel Ave, Dyckman St and 192nd, inclu... |
| Washington Heights | Uptown | W155 (once 125) to Dyckman St |
NYC_neigh_list=NYC_neigh_list.iloc[:,0:1]
NYC_neigh_dict=NYC_neigh_list.T.to_dict("list")
def categorized_neigh(neighbourhood, NYC_neigh_dict):
    """Map a neighbourhood name to its area label.

    *NYC_neigh_dict* maps neighbourhood -> [area_label] (built via
    ``DataFrame.T.to_dict('list')``); returns the first element of that
    list, or None when the neighbourhood is unknown.
    """
    entry = NYC_neigh_dict.get(neighbourhood)
    return entry[0] if entry is not None else None
listing_Mah["List_of_Neighborhood"]=listing_Mah.neighbourhood_cleansed\
.apply(lambda x:categorized_neigh(x,NYC_neigh_dict))
listing_Mah.List_of_Neighborhood.isna().sum()
0
listing_Mah.drop(columns="neighbourhood_cleansed",inplace=True)
listing_Mah.groupby("List_of_Neighborhood")\
.agg({"price":["mean",lambda x:x.max()-x.min()],"id":"count"})\
.rename(columns={"<lambda_0>":"Price range","mean":"Avg Price","count":"Number of listing"})
| price | id | ||
|---|---|---|---|
| Avg Price | Price range | Number of listing | |
| List_of_Neighborhood | |||
| Downtown | 219.948064 | 10000.0 | 5443 |
| Islands | 99.540000 | 1371.0 | 91 |
| Midtown | 245.244506 | 10000.0 | 4150 |
| Midtown and Downtown | 215.074284 | 6790.0 | 1865 |
| Uptown | 147.394204 | 10000.0 | 8192 |
sns.catplot(x="log_price",y="List_of_Neighborhood",data=listing_Mah,kind='box',aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7fd5d2e224c0>
listing_Mah.groupby("property_type_cat").id.count()
property_type_cat apartment 17590 hostel 102 hotel 755 house 1281 other 13 Name: id, dtype: int64
listing_Mah.groupby("room_type").id.count()
sns.countplot(x='room_type', data=listing_Mah,alpha=0.7)
plt.show;
sns.distplot(listing_Mah[listing_Mah['room_type']=='Hotel room'].log_price, kde=False,label='Hotel room',color='black')
sns.distplot(listing_Mah[listing_Mah['room_type']=='Private room'].log_price, kde=False,label='Private room',color='lightblue')
sns.distplot(listing_Mah[listing_Mah['room_type']=='Entire home/apt'].log_price, kde=False,label='Entire home/apt')
sns.distplot(listing_Mah[listing_Mah['room_type']=='Shared room'].log_price, kde=False,label='Shared room',color='green')
plt.title('Log_Price Distribution by Room Type')
plt.xlabel('log_price')
plt.legend()
plt.grid(False);
Using NYC_Subway.csv to obtain the location of subway stations in NYC, so that we can compute, for each listing, the number of stations within 500 m.
Data Source: https://data.ny.gov/Transportation/NYC-Transit-Subway-Entrance-And-Exit-Data/i9wp-a4ja
NYC_Sub=pd.read_csv("NYC_Subway.csv")
NYC_Sub.head()
| Division | Line | Station Name | Station Latitude | Station Longitude | Route1 | Route2 | Route3 | Route4 | Route5 | ... | ADA | ADA Notes | Free Crossover | North South Street | East West Street | Corner | Entrance Latitude | Entrance Longitude | Station Location | Entrance Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMT | 4 Avenue | 25th St | 40.660397 | -73.998091 | R | NaN | NaN | NaN | NaN | ... | False | NaN | False | 4th Ave | 25th St | SW | 40.660489 | -73.998220 | (40.660397, -73.998091) | (40.660489, -73.99822) |
| 1 | BMT | 4 Avenue | 25th St | 40.660397 | -73.998091 | R | NaN | NaN | NaN | NaN | ... | False | NaN | False | 4th Ave | 25th St | SE | 40.660323 | -73.997952 | (40.660397, -73.998091) | (40.660323, -73.997952) |
| 2 | BMT | 4 Avenue | 36th St | 40.655144 | -74.003549 | N | R | NaN | NaN | NaN | ... | False | NaN | True | 4th Ave | 36th St | NW | 40.654676 | -74.004306 | (40.655144, -74.003549) | (40.654676, -74.004306) |
| 3 | BMT | 4 Avenue | 36th St | 40.655144 | -74.003549 | N | R | NaN | NaN | NaN | ... | False | NaN | True | 4th Ave | 36th St | NE | 40.654365 | -74.004113 | (40.655144, -74.003549) | (40.654365, -74.004113) |
| 4 | BMT | 4 Avenue | 36th St | 40.655144 | -74.003549 | N | R | NaN | NaN | NaN | ... | False | NaN | True | 4th Ave | 36th St | NW | 40.654490 | -74.004499 | (40.655144, -74.003549) | (40.65449, -74.004499) |
5 rows × 32 columns
NYC_Sub=NYC_Sub.iloc[:,:5]
len(NYC_Sub)
NYC_Sub["Station Name"].nunique()
1868
356
NYC_Sub=NYC_Sub.drop_duplicates(subset ="Station Name")
#Define the Function to Calculate the Distance m
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in metres between two points.

    Inputs are (longitude, latitude) pairs in decimal degrees; the result
    uses the haversine formula with a mean Earth radius of 6371 km.
    """
    lon1, lat1, lon2, lat2 = (radians(v) for v in (lon1, lat1, lon2, lat2))
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    # Haversine of the central angle between the two points.
    chord = sin(half_dlat) ** 2 + cos(lat1) * cos(lat2) * sin(half_dlon) ** 2
    central_angle = 2 * asin(sqrt(chord))
    EARTH_RADIUS_KM = 6371
    return central_angle * EARTH_RADIUS_KM * 1000
def station_count_500m(long, lat, station_df):
    """Count subway stations within 500 m of the point (long, lat).

    Parameters
    ----------
    long, lat : float
        Listing coordinates in decimal degrees.
    station_df : DataFrame
        Must contain 'Station Longitude' and 'Station Latitude' columns.

    Performance: the previous version called a scalar haversine inside
    ``iterrows`` for every listing x station pair (~19 min wall time over
    the full listing set per the timing cell above).  This version runs the
    same haversine math vectorized in NumPy over all stations at once.
    """
    lon1, lat1 = np.radians(long), np.radians(lat)
    lon2 = np.radians(station_df["Station Longitude"].to_numpy(dtype=float))
    lat2 = np.radians(station_df["Station Latitude"].to_numpy(dtype=float))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    dist_m = 2 * 6371 * 1000 * np.arcsin(np.sqrt(a))  # mean Earth radius, metres
    return int((dist_m <= 500).sum())
#listing_Mah=pd.read_csv("listing_Mah.csv")
%%time
listing_Mah["station_count_500m"]=listing_Mah\
.apply(lambda x: station_count_500m(x.longitude,x.latitude,NYC_Sub),axis=1)
CPU times: user 9min 56s, sys: 2.89 s, total: 9min 59s Wall time: 18min 52s
#listing_Mah.to_csv("listing_Mah.csv")
sns.countplot(data=listing_Mah, x="station_count_500m");
listing_Mah.groupby("station_count_500m").log_price.mean().plot()
plt.xlabel("Number of Station")
plt.ylabel("Log price")
plt.title("Number of Station vs log price");
Using NYC_Top20.csv to count the nearby attractions
Data Source: https://www.timeout.com/newyork/attractions/top-attractions-in-manhattan
nyc_top20=pd.read_csv("NYC_Top20.csv")
nyc_top20.head()
| Attraction | Latitude | Longitude | |
|---|---|---|---|
| 0 | Central Park | 40.78509 | -73.968285 |
| 1 | Empire State Building | 40.74844 | -73.985664 |
| 2 | The Statue Of Liberty | 40.74882 | -73.985428 |
| 3 | World Trade Center | 40.71274 | -74.013382 |
| 4 | The Metropolitan Museum Of Art | 40.77897 | -73.962311 |
def landmark_1k(long, lat, df):
    """Count attractions within 1 km of the point (long, lat).

    Parameters
    ----------
    long, lat : float
        Listing coordinates in decimal degrees.
    df : DataFrame
        Must contain 'Longitude' and 'Latitude' columns (the top-20
        attractions table).

    Same vectorized-haversine rewrite as station_count_500m: one NumPy
    pass over all attractions replaces the per-row ``iterrows`` loop.
    """
    lon1, lat1 = np.radians(long), np.radians(lat)
    lon2 = np.radians(df["Longitude"].to_numpy(dtype=float))
    lat2 = np.radians(df["Latitude"].to_numpy(dtype=float))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    dist_m = 2 * 6371 * 1000 * np.arcsin(np.sqrt(a))  # mean Earth radius, metres
    return int((dist_m <= 1000).sum())
%%time
listing_Mah["nyc_attraction_1000m"]=listing_Mah\
.apply(lambda x: landmark_1k(x.longitude,x.latitude,nyc_top20),axis=1)
CPU times: user 35 s, sys: 166 ms, total: 35.2 s Wall time: 35.4 s
listing_Mah.groupby("nyc_attraction_1000m").log_price.mean().plot()
plt.ylabel("log_price")
plt.xlabel("nyc_attraction_1000m")
plt.title("Number of Attractions vs Log_price");
cor=listing_Mah.corr()
cor[cor<1].log_price.sort_values().nlargest(10)
price 0.577437 accommodates 0.500606 bedrooms 0.382580 beds 0.342978 bath_count 0.234224 nyc_attraction_1000m 0.178438 host_total_listings_count 0.121050 host_listings_count 0.121050 amenities_count 0.105150 station_count_500m 0.098530 Name: log_price, dtype: float64
listing_Mah_dum=pd.get_dummies(data=listing_Mah,columns=["room_type","List_of_Neighborhood","property_type_cat"])
listing_Mah_dum.head()
| id | host_id | host_listings_count | host_total_listings_count | latitude | longitude | accommodates | bedrooms | beds | price | ... | List_of_Neighborhood_Downtown | List_of_Neighborhood_Islands | List_of_Neighborhood_Midtown | List_of_Neighborhood_Midtown and Downtown | List_of_Neighborhood_Uptown | property_type_cat_apartment | property_type_cat_hostel | property_type_cat_hotel | property_type_cat_house | property_type_cat_other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | 2845 | 6.0 | 6.0 | 40.75362 | -73.98377 | 2 | 1.0 | 1.0 | 175.0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 5178 | 8967 | 1.0 | 1.0 | 40.76489 | -73.98493 | 2 | 1.0 | 1.0 | 73.0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 5 | 5203 | 7490 | 1.0 | 1.0 | 40.80178 | -73.96723 | 1 | 1.0 | 1.0 | 75.0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| 6 | 5552 | 8380 | 1.0 | 1.0 | 40.73552 | -74.01042 | 2 | 1.0 | 1.0 | 160.0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 8 | 6021 | 11528 | 2.0 | 2.0 | 40.79826 | -73.96113 | 1 | 1.0 | 1.0 | 85.0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
5 rows × 48 columns
import sklearn as sk
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, scale
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features: every column except identifiers and the raw/log target.
drop_cols = {'id', "host_id", 'price', 'log_price'}
X = listing_Mah_dum[[c for c in listing_Mah_dum.columns if c not in drop_cols]]
y = listing_Mah_dum.log_price

# Standardize so scale-sensitive models are not dominated by large-valued columns.
feature_names = X.columns.tolist()
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=feature_names)
X.head()
| host_listings_count | host_total_listings_count | latitude | longitude | accommodates | bedrooms | beds | minimum_nights | maximum_nights | number_of_reviews | ... | List_of_Neighborhood_Downtown | List_of_Neighborhood_Islands | List_of_Neighborhood_Midtown | List_of_Neighborhood_Midtown and Downtown | List_of_Neighborhood_Uptown | property_type_cat_apartment | property_type_cat_hostel | property_type_cat_hotel | property_type_cat_house | property_type_cat_other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.141675 | -0.141675 | -0.308178 | -0.421759 | -0.448822 | -0.389029 | -0.478616 | -0.283534 | -0.007086 | 0.673065 | ... | -0.616995 | -0.068052 | 1.938264 | -0.323001 | -0.842215 | 0.349693 | -0.072068 | -0.199415 | -0.263426 | -0.02567 |
| 1 | -0.170329 | -0.170329 | -0.013166 | -0.474510 | -0.448822 | -0.389029 | -0.478616 | -0.318957 | -0.007158 | 10.572714 | ... | -0.616995 | -0.068052 | 1.938264 | -0.323001 | -0.842215 | 0.349693 | -0.072068 | -0.199415 | -0.263426 | -0.02567 |
| 2 | -0.170329 | -0.170329 | 0.952495 | 0.330392 | -1.054028 | -0.389029 | -0.478616 | -0.318957 | -0.007158 | 2.303596 | ... | -0.616995 | -0.068052 | -0.515926 | -0.323001 | 1.187346 | 0.349693 | -0.072068 | -0.199415 | -0.263426 | -0.02567 |
| 3 | -0.170329 | -0.170329 | -0.781977 | -1.633658 | -0.448822 | -0.389029 | -0.478616 | -0.283534 | -0.007158 | 1.092345 | ... | 1.620759 | -0.068052 | -0.515926 | -0.323001 | -0.842215 | 0.349693 | -0.072068 | -0.199415 | -0.263426 | -0.02567 |
| 4 | -0.164598 | -0.164598 | 0.860353 | 0.607787 | -1.054028 | -0.389029 | -0.478616 | -0.318957 | -0.007111 | 2.420062 | ... | -0.616995 | -0.068052 | -0.515926 | -0.323001 | 1.187346 | 0.349693 | -0.072068 | -0.199415 | -0.263426 | -0.02567 |
5 rows × 44 columns
# 75/25 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
for label, arr in (("Xtrain", X_train), ("Xtest", X_test),
                   ("ytrain", y_train), ("ytest", y_test)):
    print(f"The shape of {label}:{arr.shape}")
The shape of Xtrain:(14805, 44) The shape of Xtest:(4936, 44) The shape of ytrain:(14805,) The shape of ytest:(4936,)
%%time
n_estimators=[9,10,11,12]
max_depth=[i for i in range(10,16,2)]
max_features=[i for i in range(10,16,1)]
#min_samples_leaf=2
#min_samples_split=2
parameters = {
'n_estimators':n_estimators,
'max_depth':max_depth,
'max_features':max_features
#,'min_samples_leaf':min_samples_leaf,
#'min_samples_split':min_samples_split
}
rf = RandomForestRegressor(random_state=0)
GS=GridSearchCV(rf,parameters,cv=3)
GS.fit(X_train,y_train)
CPU times: user 37.5 s, sys: 848 ms, total: 38.4 s Wall time: 38.8 s
GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=0),
param_grid={'max_depth': [10, 12, 14],
'max_features': [10, 11, 12, 13, 14, 15],
'n_estimators': [9, 10, 11, 12]})
GS.best_params_  # hyper-parameters selected by the 3-fold grid search above
{'max_depth': 12, 'max_features': 10, 'n_estimators': 12}
nfolds = 3
# Shuffled K-fold shared by all CV scorers below (seeded for reproducibility).
kf = KFold(n_splits=nfolds, random_state=0, shuffle=True)

def mse_score(model):
    """Per-fold negative MSE (sklearn convention: higher is better)."""
    return cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)

def mae_score(model):
    """Per-fold negative MAE."""
    return cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=kf)

def cv_score(model):
    """Per-fold default score (R^2 for regressors)."""
    return cross_val_score(model, X, y, cv=kf)

# Use the hyper-parameters the grid search actually selected. The previous
# hard-coded values (max_depth=14, max_features=14, n_estimators=12) did not
# match GS.best_params_ (max_depth=12, max_features=10, n_estimators=12).
rf_model = RandomForestRegressor(random_state=0, min_samples_leaf=2,
                                 min_samples_split=2, **GS.best_params_)
rf_model = rf_model.fit(X_train, y_train)
rf_predict_ytrain = rf_model.predict(X_train)
rf_predict_ytest = rf_model.predict(X_test)
rf_result = pd.DataFrame({
    'model': ['RandomForestRegressor'],
    'Cross Validation Score:': cv_score(rf_model).mean(),
    'MAE Score:': -mae_score(rf_model).mean(),
    "MSE Score:": -mse_score(rf_model).mean(),
    'R squared on Test:': r2_score(y_test, rf_predict_ytest),
    'R squared on train:': r2_score(y_train, rf_predict_ytrain),
})
rf_result
| model | Cross Validation Score: | MAE Score: | MSE Score: | R squared on Test: | R squared on train: | |
|---|---|---|---|---|---|---|
| 0 | RandomForestRegressor | 0.585436 | 0.319222 | 0.223763 | 0.574028 | 0.768439 |
# Back-transform predictions to dollars to report interpretable errors.
# NOTE(review): expm1 assumes log_price was built with np.log1p — confirm
# against the cell that created log_price.
inverse_test = np.expm1(y_test)
inverse_pred = np.expm1(rf_predict_ytest)
# error
inverse_mae = mean_absolute_error(inverse_test, inverse_pred)
inverse_rmse = mean_squared_error(inverse_test, inverse_pred) ** 0.5
# Format with fixed decimals; the previous str(x)[:6] truncation could cut
# mid-digit or mangle scientific notation for large/small values.
print(f"Mean Absolute Error in USD is ${inverse_mae:.2f}")
print(f"Root Mean Squared Error in USD is ${inverse_rmse:.2f}")
Mean Absolute Error in USD is $79.806 Root Mean Squared Error in USD is $360.96
# Actual vs. predicted log price on the hold-out set.
fig = plt.figure(figsize=(10, 3))
sns.regplot(y_test, rf_predict_ytest, color="grey")
plt.title("Residuals for the model");
# Rank features by the forest's impurity-based importances and plot the
# ten strongest and ten weakest side by side.
feat_importance = pd.Series(data=rf_model.feature_importances_,
                            index=X_train.columns)
fig = plt.figure(figsize=(21, 6))
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_title('Most Important Features', size=16)
ax2 = fig.add_subplot(1, 2, 2)
ax2.set_title('Least Important Features', size=16)
plt.sca(ax1)
plt.xticks(rotation=45, ha='right')
ax1.plot(feat_importance.nlargest(10))
plt.sca(ax2)
plt.xticks(rotation=45, ha='right')
ax2.plot(feat_importance.nsmallest(10), color="grey");
import sklearn as sk
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold

# Candidate models to benchmark against the tuned random forest.
regs = [
    LinearRegression(),
    ElasticNet(),
    DecisionTreeRegressor(),
    GradientBoostingRegressor(),
    MLPRegressor(),
    SVR(),
    linear_model.Lasso(),
]
regs_name = [
    "LinearRegression",
    "ElasticNet",
    "DecisionTreeRegressor",
    "GradientBoostingRegressor",
    "MLPRegressor",
    "SVR",
    "Lasso",
]
%%time
regression_dict={"Model":["RandomForestRegressor"],
"MAE":[-mae_score(rf_model).mean()]}
for reg,name in zip(regs,regs_name):
mae = -sk.model_selection.cross_val_score(reg,X,y,\
cv=kf,scoring='neg_mean_absolute_error').mean()
regression_dict["Model"].append(name)
regression_dict["MAE"].append(mae)
regression_result=pd.DataFrame(regression_dict).sort_values("MAE")
CPU times: user 2min 24s, sys: 17.8 s, total: 2min 42s Wall time: 1min 25s
# MAE comparison across models (lower is better), annotated per bar.
ax = sns.barplot(x="MAE", y="Model", data=regression_result)
for p in ax.patches:
    ax.annotate("%.4f" % p.get_width(), (p.get_x() + p.get_width(), p.get_y() + 1),
                xytext=(5, 10), textcoords='offset points')
# Use positional indexing: after sort_values the rows keep their original
# labels, so regression_result.Model[0] would always name the first model
# inserted (RandomForestRegressor) even if it were not the best one.
best = regression_result.iloc[0]
plt.title("Best Model: " + best.Model + " " + " MAE: " + str(round(best.MAE, 4)), size=14)
plt.show()  # was `plt.show;` — the function was referenced but never called
Findings:
RandomForestRegressor tuned by Grid Search performs the best among all with MAE=0.318 in log terms, or $79 in USD. According to the feature-importance results, the top 10 contributing variables relate to private space, bathroom type and count, bedroom type and listing location, whereas other property types, neighborhood area and review scores contribute the least. Another distinctive finding: by combining Airbnb data with NYC transit and attraction data, listing price is highly correlated with the number of subway stations and attractions near the house.
Managerial insight:
Our regression model can successfully support the business by predicting a benchmark listing price, and can further be used as the prototype of a pricing-recommendation system for new hosts. Besides, the feature-importance results indicate the factors on which the model can be improved in future research and development. We believe our work can provide business insights for Airbnb operations in real business scenarios.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# df2.head()
# Superhosts differ markedly on review scores, review volume and response rate.
metric_cols = ["price", "review_scores_rating", "amenities_count",
               "number_of_reviews", "number_of_reviews_ltm",
               "host_response_rate"]
df2.groupby("host_is_superhost")[metric_cols].mean()
| price | review_scores_rating | amenities_count | number_of_reviews | number_of_reviews_ltm | host_response_rate | |
|---|---|---|---|---|---|---|
| host_is_superhost | ||||||
| RegularHost | 196.46 | 93.35 | 17.60 | 13.99 | 2.45 | 0.35 |
| SuperHost | 180.00 | 96.81 | 24.95 | 50.38 | 9.10 | 0.69 |
# Listing counts per host class, then the superhost share in percent.
len(df2.query("host_is_superhost == 'SuperHost'"))
len(df2.query("host_is_superhost == 'RegularHost'"))
100 * df2.host_is_superhost1.mean()
2777
16964
14.067169849551695
sns.countplot(data=df2, x='host_is_superhost')
<AxesSubplot:xlabel='host_is_superhost', ylabel='count'>
# Bucket ratings so the two host classes can be compared per rating band.
rating_bins = [0, 85, 93, 96, 100]
df2['review_scores_rating.binned'] = pd.cut(df2['review_scores_rating'], bins=rating_bins)
sns.catplot(data=df2, x='review_scores_rating.binned',
            hue='host_is_superhost', kind='count', aspect=1)
<seaborn.axisgrid.FacetGrid at 0x7fd5d33a5f10>
# Superhost status correlates most with number of amenities, then
# cleanliness, value, communication and check-in — the levers a regular
# host can pull to improve their standing.
cor = df2.corr()
corsh = cor.host_is_superhost1.sort_values(ascending=False)[
    ["amenities_count", "review_scores_cleanliness", "review_scores_value",
     "review_scores_communication", "review_scores_checkin"]
]
corsh
amenities_count 0.30 review_scores_cleanliness 0.20 review_scores_value 0.18 review_scores_communication 0.09 review_scores_checkin 0.09 Name: host_is_superhost1, dtype: float64
# Bar chart of the correlation subset computed above.
fig = plt.figure(figsize=(7, 3))
corsh.plot(kind="bar", rot=90)
<AxesSubplot:>
# Superhosts average 9.1 reviews in the last 12 months vs 2.45 for regular hosts.
df2['number_of_reviews_ltm.binned'] = pd.cut(df2['number_of_reviews_ltm'],
                                             bins=[0, 1, 3, 10, 480])
sns.catplot(data=df2, x='number_of_reviews_ltm.binned',
            hue='host_is_superhost', kind="count", aspect=1);
# Superhosts average ~50 total reviews; regular hosts concentrate below 14.
df2['number_of_reviews.binned'] = pd.cut(df2['number_of_reviews'],
                                         bins=[0, 6, 14, 30, 750])
sns.catplot(data=df2, x='number_of_reviews.binned',
            hue='host_is_superhost', kind="count", aspect=1);
# Superhosts respond at roughly twice the rate of regular hosts.
sns.catplot(data=df2, x="host_is_superhost", y="host_response_rate",
            kind="bar", aspect=1)
<seaborn.axisgrid.FacetGrid at 0x7fd5d6cd7580>
plt.subplots(figsize=(8, 5))
# Pass the column by keyword: seaborn >= 0.12 no longer accepts the data
# variable as the first positional argument of countplot.
# ("No responsing" is the literal category value in the data — do not "fix" it.)
sns.countplot(x="host_response_time", hue="host_is_superhost", data=df2,
              order=["No responsing", "a few days or more", "within a day",
                     "within a few hours", "within an hour"])
(<Figure size 576x360 with 1 Axes>, <AxesSubplot:>)
<AxesSubplot:xlabel='host_response_time', ylabel='count'>
# In the (93, 96] band superhosts charge ~$42 less than regular hosts.
df2.groupby(["host_is_superhost", "review_scores_rating.binned"])['price'].mean().unstack()
| review_scores_rating.binned | (0, 85] | (85, 93] | (93, 96] | (96, 100] |
|---|---|---|---|---|
| host_is_superhost | ||||
| RegularHost | 170.08 | 157.47 | 224.08 | 189.49 |
| SuperHost | 240.99 | 174.70 | 182.30 | 178.66 |
# Surprisingly, superhosts charge less than regular hosts at higher ratings.
sns.catplot(data=df2, x="review_scores_rating.binned", y="price",
            hue="host_is_superhost", kind="bar")
<seaborn.axisgrid.FacetGrid at 0x7fd5d69f0130>
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

sh = df2[df2.host_is_superhost == 'SuperHost']
# Call the setter: the original line assigned the token string to the
# function attribute (px.set_mapbox_access_token = '...'), which clobbers
# the plotly helper and never registers the token.
px.set_mapbox_access_token(
    'pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Map of superhost listings across Manhattan.
fig2 = px.scatter_mapbox(data_frame=sh,
                         lat="latitude",
                         lon="longitude",
                         color="host_is_superhost",
                         color_discrete_sequence=['#FF5A5F'],
                         opacity=0.7,
                         size_max=10)
fig2.update_layout(mapbox_style="carto-positron",
                   mapbox_zoom=11, mapbox_center={"lat": 40.75, "lon": -73.98})
Findings:
“Superhost” correlates most with amenities, then followed by cleanliness and value. Communication and check-in experience ranked 4th and 5th. For those above 93 points superhosts, surprisingly they charge less than regular hosts by 42 dollars on average per night.
Managerial insights:
We can help regular hosts to improve the areas aforementioned to elevate the superhost proportion in Manhattan. To maximize profit, superhosts with higher scores can try to increase their price step by step to test out the market and to match up with the high quality of the house and the superior service superhosts provided.
Objective: To promote regular host to superhost
1) From the overview of Superhost and Regular host distribution geographically, Top 3 goes to Harlem, Hell’s Kitchen, Midtown, and by each district, try to promote the proportion of regularhost to superhost to meet the 14% benchmark to help out the middle ranking and even the small district that are left out.
Findings
1) In general, superhosts outperform regular hosts in price (and price value), response rate, acceptance rate, review volume, and review scores; however, the location scores are very close, suggesting a biased geographic concentration of superhosts.
Managerial Insight
Methodology:
1) Taking a 5% discount off the superhost averages for response rate and the review scores for rating, cleanliness, value, communication, and check-in, and looking at the concentration of simulated superhosts and regular hosts in each district, we take the common districts that hit the benchmark on each criterion:
2) By looking at the review score table (in green) generated above, we have below conclusion
Conclusion:
1) Boosting the ratio of superhost in the following areas For the middle stream: Lower East Side, China Town, Finance District For the lower stream: Civic Center, Little Italy, Marble Hill, Nolita
2) This finding also echoes the sentiment analysis that the high sentiment score are mostly concentrated and the lower part of Manhattan (as highlighted in yellow in the map)
</font>
df3 = df2.copy()
# Superhosts account for ~14% of Manhattan hosts.
# Filter df2 with a mask built from df2 itself: the original indexed df2
# with a boolean Series from a different frame (listing_nyc), which only
# works by accident when the two indexes happen to align.
len(df2[df2.host_is_superhost == 'SuperHost'])
len(df2[df2.host_is_superhost == 'RegularHost'])
(len(df2[df2.host_is_superhost == 'SuperHost'])) / (len(df2))
2777
16964
0.14067169849551694
sns.countplot(data=df2, x='host_is_superhost')
<AxesSubplot:xlabel='host_is_superhost', ylabel='count'>
# Superhost counts per neighbourhood: Harlem, Hell's Kitchen and Midtown lead.
# Is there room to promote superhosts in other locations where scores are
# already good?
df3_superhost = df3[df3['host_is_superhost'] == 'SuperHost']
neighbourhood_order = df3_superhost['neighbourhood_cleansed'].value_counts().index
sns.countplot(y='neighbourhood_cleansed', data=df3_superhost,
              order=neighbourhood_order, alpha=0.7)
plt.tight_layout()
plt.show()
<AxesSubplot:xlabel='count', ylabel='neighbourhood_cleansed'>
# Share of all listings per neighbourhood, in percent (vectorized division
# instead of an element-wise apply — identical values).
df3_neighbourhood = df3.groupby('neighbourhood_cleansed')['host_is_superhost1'].count()
(df3_neighbourhood / df3_neighbourhood.sum() * 100).sort_values(ascending=False)
neighbourhood_cleansed Harlem 12.42 Hell's Kitchen 9.03 Upper West Side 9.02 Upper East Side 8.24 Midtown 8.17 East Village 8.06 Chelsea 5.22 East Harlem 4.81 Washington Heights 4.21 Lower East Side 3.90 West Village 3.39 Financial District 2.58 Murray Hill 2.13 Kips Bay 2.12 Chinatown 2.00 Theater District 1.69 Greenwich Village 1.64 SoHo 1.64 Morningside Heights 1.53 Gramercy 1.51 Nolita 1.40 Inwood 1.20 Tribeca 0.95 Little Italy 0.73 Roosevelt Island 0.46 Battery Park City 0.37 Flatiron District 0.35 Two Bridges 0.34 NoHo 0.33 Stuyvesant Town 0.25 Civic Center 0.23 Marble Hill 0.07 Name: host_is_superhost1, dtype: float64
df4 = pd.get_dummies(df3, columns=['host_is_superhost'])
# Overall superhost share is ~14%; look for districts below that benchmark
# that could be promoted toward it.
host_share_by_district = df4.groupby('neighbourhood_cleansed')[
    ['host_is_superhost_RegularHost', 'host_is_superhost_SuperHost']].mean()
host_share_by_district.sort_values(by='host_is_superhost_SuperHost', ascending=False)
| host_is_superhost_RegularHost | host_is_superhost_SuperHost | |
|---|---|---|
| neighbourhood_cleansed | ||
| Marble Hill | 0.77 | 0.23 |
| Harlem | 0.78 | 0.22 |
| Washington Heights | 0.80 | 0.20 |
| East Harlem | 0.81 | 0.19 |
| Roosevelt Island | 0.82 | 0.18 |
| SoHo | 0.84 | 0.16 |
| Inwood | 0.85 | 0.15 |
| Hell's Kitchen | 0.86 | 0.14 |
| Midtown | 0.86 | 0.14 |
| NoHo | 0.86 | 0.14 |
| Gramercy | 0.86 | 0.14 |
| Lower East Side | 0.86 | 0.14 |
| West Village | 0.87 | 0.13 |
| East Village | 0.87 | 0.13 |
| Chinatown | 0.87 | 0.13 |
| Upper East Side | 0.88 | 0.12 |
| Upper West Side | 0.88 | 0.12 |
| Chelsea | 0.89 | 0.11 |
| Greenwich Village | 0.90 | 0.10 |
| Kips Bay | 0.90 | 0.10 |
| Theater District | 0.90 | 0.10 |
| Little Italy | 0.91 | 0.09 |
| Two Bridges | 0.91 | 0.09 |
| Morningside Heights | 0.91 | 0.09 |
| Financial District | 0.91 | 0.09 |
| Flatiron District | 0.91 | 0.09 |
| Murray Hill | 0.92 | 0.08 |
| Nolita | 0.93 | 0.07 |
| Battery Park City | 0.93 | 0.07 |
| Tribeca | 0.94 | 0.06 |
| Civic Center | 0.96 | 0.04 |
| Stuyvesant Town | 0.96 | 0.04 |
# Correlation of every numeric column with superhost status. Amenities count,
# acceptance/response rate and cleanliness lead — concrete areas where a
# regular host can differentiate themselves toward superhost status.
cor = df3.corr()
# cor[cor < 1].stack()[::2]
cor.host_is_superhost1.sort_values(ascending=False)
host_is_superhost1 1.00 host_acceptance_rate 0.32 amenities_count 0.30 number_of_reviews 0.29 reviews_per_month 0.28 host_response_rate 0.26 number_of_reviews_ltm 0.24 review_scores_cleanliness 0.20 review_scores_value 0.18 review_scores_rating 0.15 availability_90 0.11 availability_60 0.11 availability_365 0.11 review_scores_accuracy 0.11 availability_30 0.10 review_scores_communication 0.09 review_scores_checkin 0.09 number_of_reviews_l30d 0.08 longitude 0.08 latitude 0.08 beds 0.06 accommodates 0.05 review_scores_location 0.04 bedrooms 0.04 has_availability 0.01 maximum_nights -0.00 instant_bookable -0.01 price -0.01 minimum_nights -0.02 calculated_host_listings_count_shared_rooms -0.02 host_total_listings_count -0.04 host_listings_count -0.04 calculated_host_listings_count_private_rooms -0.05 calculated_host_listings_count_entire_homes -0.06 calculated_host_listings_count -0.08 Name: host_is_superhost1, dtype: float64
# Significant gaps in price, response rate, acceptance rate, review volume
# and review scores. Airbnb could review these factors and re-tag regular
# hosts that already hit the superhost benchmarks.
df3.groupby("host_is_superhost").mean().style.background_gradient(cmap='Pastel1_r')
| host_response_rate | host_acceptance_rate | host_is_superhost1 | host_listings_count | host_total_listings_count | accommodates | bedrooms | beds | price | minimum_nights | maximum_nights | has_availability | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | number_of_reviews_ltm | number_of_reviews_l30d | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | amenities_count | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| host_is_superhost | |||||||||||||||||||||||||||||||||||
| RegularHost | 0.352758 | 0.431125 | 0.000000 | 33.406331 | 33.406331 | 2.708029 | 1.218109 | 1.434921 | 196.457410 | 11.202959 | 127236.186395 | 0.999705 | 7.859939 | 17.700660 | 28.258842 | 104.315079 | 13.985322 | 2.449010 | 0.114183 | 93.350153 | 9.639059 | 9.069382 | 9.745225 | 9.756013 | 9.775171 | 9.149198 | 0.351981 | 11.006249 | 7.734673 | 2.900024 | 0.112768 | 0.425130 | 17.598031 | 40.764214 | -73.975208 |
| SuperHost | 0.694019 | 0.844364 | 1.000000 | 14.324451 | 14.324451 | 2.946705 | 1.289881 | 1.613612 | 180.001260 | 9.790421 | 588.973353 | 1.000000 | 11.329852 | 25.780338 | 40.869644 | 147.944184 | 50.378106 | 9.100828 | 0.338855 | 96.812387 | 9.894851 | 9.628376 | 9.933021 | 9.953907 | 9.852359 | 9.592726 | 0.338855 | 3.607850 | 2.727404 | 0.788621 | 0.029888 | 1.284894 | 24.952467 | 40.772592 | -73.970142 |
# Take an explicit copy: a later cell adds a 'response_superhost' column to
# this frame, and writing into a bare column-selection slice relies on the
# (suppressed) SettingWithCopy behavior.
df3_response = df3[['host_response_time', 'host_response_rate', 'host_is_superhost',
                    'host_acceptance_rate', 'neighbourhood_cleansed',
                    'latitude', 'longitude']].copy()
# In aggregate, regular hosts lag mostly because of their large
# high-proportion "no response" segment.
df3_response.groupby('host_is_superhost')[
    ['host_response_rate', 'host_acceptance_rate']
].mean().style.background_gradient(cmap='Pastel1_r')
| host_response_rate | host_acceptance_rate | |
|---|---|---|
| host_is_superhost | ||
| RegularHost | 0.352758 | 0.431125 |
| SuperHost | 0.694019 | 0.844364 |
# Superhosts respond faster and accept more requests across every
# response-time band — notably even for prolonged inquiries. Still, a
# sizeable group of regular hosts is just as swift and welcoming.
df3_response.groupby(['host_response_time', 'host_is_superhost']).agg(
    {'host_response_rate': ['mean', 'count'],
     'host_acceptance_rate': ['mean', 'count']}
).style.background_gradient(cmap='Pastel1_r')
| host_response_rate | host_acceptance_rate | ||||
|---|---|---|---|---|---|
| mean | count | mean | count | ||
| host_response_time | host_is_superhost | ||||
| No responsing | RegularHost | 0.000000 | 9685 | 0.186962 | 9685 |
| SuperHost | 0.000000 | 757 | 0.783038 | 757 | |
| a few days or more | RegularHost | 0.112000 | 880 | 0.544420 | 880 |
| SuperHost | 0.082857 | 35 | 0.907429 | 35 | |
| within a day | RegularHost | 0.785072 | 1323 | 0.588103 | 1323 |
| SuperHost | 0.892305 | 321 | 0.747882 | 321 | |
| within a few hours | RegularHost | 0.924241 | 1568 | 0.758176 | 1568 |
| SuperHost | 0.971738 | 535 | 0.850636 | 535 | |
| within an hour | RegularHost | 0.968575 | 3508 | 0.871411 | 3508 |
| SuperHost | 0.990328 | 1129 | 0.907989 | 1129 | |
review_cols = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
               'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
               'review_scores_value', 'host_is_superhost', 'neighbourhood_group_cleansed',
               'neighbourhood_cleansed', 'number_of_reviews', 'number_of_reviews_ltm',
               'number_of_reviews_l30d', 'latitude', 'longitude', 'property_type_cat']
# Explicit copy: columns are added/rescaled below and must never write back
# into df3 (the notebook suppresses SettingWithCopy warnings).
df3_review = df3[review_cols].copy()
# The overall rating is on a 0-100 scale while the sub-scores are 0-10;
# rescale before averaging the seven scores together.
df3_review['review_scores_rating'] = df3_review['review_scores_rating'] / 10.0
df3_review['overall_rating_average'] = (
    df3_review['review_scores_rating']
    + df3_review['review_scores_accuracy']
    + df3_review['review_scores_cleanliness']
    + df3_review['review_scores_checkin']
    + df3_review['review_scores_communication']
    + df3_review['review_scores_location']
    + df3_review['review_scores_value']
) / 7
# Superhosts concentrate in the high-rating range, with the same shape for
# the other rating types — but plenty of regular hosts score similarly,
# meaning there is room to upgrade them.
sns.catplot(data=df3_review, y='host_is_superhost',
            x='review_scores_rating', aspect=2)
<seaborn.axisgrid.FacetGrid at 0x7fd5d1fabf40>
# Mean and count of each key score per host class, pinpointing the strong
# superhost indicators before mapping their geographic distribution.
score_aggs = {col: ['mean', 'count'] for col in (
    'overall_rating_average',
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_value',
    'review_scores_communication',
    'review_scores_checkin',
    'number_of_reviews',
)}
df3_review.groupby('host_is_superhost').agg(score_aggs).style.background_gradient(cmap='Pastel1_r')
| overall_rating_average | review_scores_rating | review_scores_cleanliness | review_scores_value | review_scores_communication | review_scores_checkin | number_of_reviews | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | count | mean | count | mean | count | mean | count | mean | count | mean | count | mean | count | |
| host_is_superhost | ||||||||||||||
| RegularHost | 9.495581 | 16964 | 9.335015 | 16964 | 9.069382 | 16964 | 9.149198 | 16964 | 9.756013 | 16964 | 9.745225 | 16964 | 13.985322 | 16964 |
| SuperHost | 9.790925 | 2777 | 9.681239 | 2777 | 9.628376 | 2777 | 9.592726 | 2777 | 9.953907 | 2777 | 9.933021 | 2777 | 50.378106 | 2777 |
# Top superhost performers by neighbourhood:
#   cleanliness   -> Civic Center, Marble Hill (full score), Little Italy (9.92)
#   communication -> Civic Center, Little Italy
#   price value   -> Civic Center, Marble Hill
#   review volume -> Marble Hill, East Harlem
#   overall       -> Civic Center and Marble Hill
# Civic Center, Marble Hill and Little Italy top the list, and regular hosts
# there also score high — promote more listings in these areas.
neigh_aggs = {col: ['mean', 'count'] for col in (
    'overall_rating_average',
    'review_scores_cleanliness',
    'review_scores_communication',
    'review_scores_value',
    'number_of_reviews',
)}
df3_review.groupby(['neighbourhood_cleansed', 'host_is_superhost']).agg(
    neigh_aggs).style.background_gradient(cmap='Greens')
| overall_rating_average | review_scores_cleanliness | review_scores_communication | review_scores_value | number_of_reviews | |||||||
|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | count | mean | count | mean | count | mean | count | mean | count | ||
| neighbourhood_cleansed | host_is_superhost | ||||||||||
| Battery Park City | RegularHost | 9.578782 | 68 | 9.000000 | 68 | 9.779412 | 68 | 9.176471 | 68 | 6.500000 | 68 |
| SuperHost | 9.785714 | 5 | 9.400000 | 5 | 10.000000 | 5 | 9.400000 | 5 | 3.600000 | 5 | |
| Chelsea | RegularHost | 9.487996 | 914 | 9.109409 | 914 | 9.723195 | 914 | 9.098468 | 914 | 15.123632 | 914 |
| SuperHost | 9.813793 | 116 | 9.637931 | 116 | 9.931034 | 116 | 9.594828 | 116 | 54.663793 | 116 | |
| Chinatown | RegularHost | 9.430980 | 344 | 9.026163 | 344 | 9.703488 | 344 | 9.127907 | 344 | 14.156977 | 344 |
| SuperHost | 9.788857 | 50 | 9.540000 | 50 | 9.940000 | 50 | 9.640000 | 50 | 47.180000 | 50 | |
| Civic Center | RegularHost | 9.514610 | 44 | 9.136364 | 44 | 9.704545 | 44 | 9.113636 | 44 | 5.113636 | 44 |
| SuperHost | 9.892857 | 2 | 10.000000 | 2 | 10.000000 | 2 | 10.000000 | 2 | 23.000000 | 2 | |
| East Harlem | RegularHost | 9.375103 | 766 | 8.984334 | 766 | 9.718016 | 766 | 9.083551 | 766 | 20.255875 | 766 |
| SuperHost | 9.763121 | 184 | 9.701087 | 184 | 9.978261 | 184 | 9.679348 | 184 | 68.771739 | 184 | |
| East Village | RegularHost | 9.542816 | 1382 | 9.028944 | 1382 | 9.820550 | 1382 | 9.217800 | 1382 | 15.324168 | 1382 |
| SuperHost | 9.789864 | 210 | 9.542857 | 210 | 9.971429 | 210 | 9.523810 | 210 | 60.404762 | 210 | |
| Financial District | RegularHost | 9.535668 | 465 | 9.062366 | 465 | 9.756989 | 465 | 9.150538 | 465 | 7.750538 | 465 |
| SuperHost | 9.826667 | 45 | 9.600000 | 45 | 9.933333 | 45 | 9.644444 | 45 | 30.933333 | 45 | |
| Flatiron District | RegularHost | 9.494785 | 63 | 9.142857 | 63 | 9.714286 | 63 | 9.095238 | 63 | 8.650794 | 63 |
| SuperHost | 9.595238 | 6 | 9.666667 | 6 | 9.500000 | 6 | 9.333333 | 6 | 57.666667 | 6 | |
| Gramercy | RegularHost | 9.547248 | 257 | 9.101167 | 257 | 9.778210 | 257 | 9.214008 | 257 | 11.626459 | 257 |
| SuperHost | 9.782230 | 41 | 9.463415 | 41 | 10.000000 | 41 | 9.512195 | 41 | 60.756098 | 41 | |
| Greenwich Village | RegularHost | 9.587980 | 290 | 9.217241 | 290 | 9.810345 | 290 | 9.220690 | 290 | 9.603448 | 290 |
| SuperHost | 9.675630 | 34 | 9.500000 | 34 | 9.764706 | 34 | 9.558824 | 34 | 59.705882 | 34 | |
| Harlem | RegularHost | 9.490474 | 1909 | 9.094814 | 1909 | 9.772656 | 1909 | 9.248298 | 1909 | 19.305919 | 1909 |
| SuperHost | 9.809357 | 542 | 9.723247 | 542 | 9.957565 | 542 | 9.697417 | 542 | 56.234317 | 542 | |
| Hell's Kitchen | RegularHost | 9.463012 | 1531 | 9.029393 | 1531 | 9.738080 | 1531 | 9.055519 | 1531 | 18.858263 | 1531 |
| SuperHost | 9.785259 | 251 | 9.613546 | 251 | 9.940239 | 251 | 9.549801 | 251 | 54.689243 | 251 | |
| Inwood | RegularHost | 9.502416 | 201 | 9.094527 | 201 | 9.746269 | 201 | 9.288557 | 201 | 13.019900 | 201 |
| SuperHost | 9.799592 | 35 | 9.600000 | 35 | 9.942857 | 35 | 9.742857 | 35 | 41.057143 | 35 | |
| Kips Bay | RegularHost | 9.468267 | 375 | 9.010667 | 375 | 9.749333 | 375 | 9.093333 | 375 | 12.333333 | 375 |
| SuperHost | 9.776080 | 43 | 9.534884 | 43 | 10.000000 | 43 | 9.348837 | 43 | 15.302326 | 43 | |
| Little Italy | RegularHost | 9.568939 | 132 | 9.280303 | 132 | 9.643939 | 132 | 9.272727 | 132 | 8.666667 | 132 |
| SuperHost | 9.919780 | 13 | 9.923077 | 13 | 10.000000 | 13 | 9.692308 | 13 | 70.307692 | 13 | |
| Lower East Side | RegularHost | 9.480988 | 665 | 9.004511 | 665 | 9.748872 | 665 | 9.153383 | 665 | 16.766917 | 665 |
| SuperHost | 9.777687 | 105 | 9.580952 | 105 | 9.971429 | 105 | 9.600000 | 105 | 56.552381 | 105 | |
| Marble Hill | RegularHost | 9.704286 | 10 | 9.500000 | 10 | 9.900000 | 10 | 9.500000 | 10 | 17.900000 | 10 |
| SuperHost | 9.938095 | 3 | 10.000000 | 3 | 10.000000 | 3 | 10.000000 | 3 | 27.000000 | 3 | |
| Midtown | RegularHost | 9.439002 | 1386 | 9.059885 | 1386 | 9.686869 | 1386 | 8.983405 | 1386 | 8.101732 | 1386 |
| SuperHost | 9.762240 | 227 | 9.568282 | 227 | 9.960352 | 227 | 9.453744 | 227 | 26.951542 | 227 | |
| Morningside Heights | RegularHost | 9.557609 | 276 | 9.061594 | 276 | 9.862319 | 276 | 9.250000 | 276 | 7.865942 | 276 |
| SuperHost | 9.857143 | 27 | 9.629630 | 27 | 10.000000 | 27 | 9.703704 | 27 | 61.370370 | 27 | |
| Murray Hill | RegularHost | 9.450716 | 389 | 9.059126 | 389 | 9.688946 | 389 | 8.992288 | 389 | 6.272494 | 389 |
| SuperHost | 9.731250 | 32 | 9.531250 | 32 | 9.906250 | 32 | 9.375000 | 32 | 56.093750 | 32 | |
| NoHo | RegularHost | 9.621939 | 56 | 9.303571 | 56 | 9.785714 | 56 | 9.321429 | 56 | 15.589286 | 56 |
| SuperHost | 9.726984 | 9 | 9.222222 | 9 | 10.000000 | 9 | 9.333333 | 9 | 18.222222 | 9 | |
| Nolita | RegularHost | 9.577043 | 257 | 9.136187 | 257 | 9.828794 | 257 | 9.182879 | 257 | 14.964981 | 257 |
| SuperHost | 9.775188 | 19 | 9.684211 | 19 | 9.894737 | 19 | 9.473684 | 19 | 35.000000 | 19 | |
| Roosevelt Island | RegularHost | 9.012000 | 75 | 8.226667 | 75 | 9.373333 | 75 | 8.800000 | 75 | 9.080000 | 75 |
| SuperHost | 9.895536 | 16 | 9.750000 | 16 | 10.000000 | 16 | 9.750000 | 16 | 46.750000 | 16 | |
| SoHo | RegularHost | 9.556096 | 273 | 9.201465 | 273 | 9.750916 | 273 | 9.179487 | 273 | 17.293040 | 273 |
| SuperHost | 9.747339 | 51 | 9.607843 | 51 | 9.901961 | 51 | 9.470588 | 51 | 48.568627 | 51 | |
| Stuyvesant Town | RegularHost | 9.429464 | 48 | 8.937500 | 48 | 9.604167 | 48 | 9.145833 | 48 | 3.000000 | 48 |
| SuperHost | 9.742857 | 2 | 9.500000 | 2 | 10.000000 | 2 | 9.500000 | 2 | 37.500000 | 2 | |
| Theater District | RegularHost | 9.366113 | 301 | 8.940199 | 301 | 9.561462 | 301 | 8.850498 | 301 | 13.435216 | 301 |
| SuperHost | 9.777922 | 33 | 9.545455 | 33 | 9.969697 | 33 | 9.545455 | 33 | 52.545455 | 33 | |
| Tribeca | RegularHost | 9.552082 | 175 | 9.114286 | 175 | 9.811429 | 175 | 9.051429 | 175 | 5.068571 | 175 |
| SuperHost | 9.760714 | 12 | 9.500000 | 12 | 10.000000 | 12 | 9.250000 | 12 | 40.833333 | 12 | |
| Two Bridges | RegularHost | 9.434192 | 61 | 9.147541 | 61 | 9.688525 | 61 | 9.196721 | 61 | 18.081967 | 61 |
| SuperHost | 9.664286 | 6 | 9.500000 | 6 | 10.000000 | 6 | 9.166667 | 6 | 33.166667 | 6 | |
| Upper East Side | RegularHost | 9.558100 | 1433 | 9.157711 | 1433 | 9.820656 | 1433 | 9.181438 | 1433 | 11.704117 | 1433 |
| SuperHost | 9.764948 | 194 | 9.567010 | 194 | 9.948454 | 194 | 9.505155 | 194 | 36.943299 | 194 | |
| Upper West Side | RegularHost | 9.518419 | 1569 | 9.050351 | 1569 | 9.773741 | 1569 | 9.191205 | 1569 | 12.899299 | 1569 |
| SuperHost | 9.831334 | 212 | 9.627358 | 212 | 9.971698 | 212 | 9.641509 | 212 | 50.216981 | 212 | |
| Washington Heights | RegularHost | 9.466724 | 668 | 9.029940 | 668 | 9.750000 | 668 | 9.276946 | 668 | 12.437126 | 668 |
| SuperHost | 9.799912 | 163 | 9.705521 | 163 | 9.950920 | 163 | 9.736196 | 163 | 48.883436 | 163 | |
| West Village | RegularHost | 9.594984 | 581 | 9.185886 | 581 | 9.814114 | 581 | 9.266781 | 581 | 14.407917 | 581 |
| SuperHost | 9.815730 | 89 | 9.640449 | 89 | 9.932584 | 89 | 9.494382 | 89 | 49.449438 | 89 | |
#93 of rating score should be a threshold of SuperHost in review rating score
#Significant difference in higher rating score range above 93
# Bucket listings by overall rating, then compare recent (last-12-months)
# review volume per bucket, split by superhost status.
rating_bins = [0, 85, 93, 96, 99, 100]
df3['review_scores_rating.binned'] = pd.cut(df3['review_scores_rating'], bins=rating_bins)
sns.catplot(
    x='review_scores_rating.binned',
    y='number_of_reviews_ltm',
    hue='host_is_superhost',
    data=df3,
    kind='bar',
    aspect=2,
)
<seaborn.axisgrid.FacetGrid at 0x7fd5d300a790>
#Setting up some threshold of perspective aspect to find out the location
#Assumption: take a 5% discount off the Superhost average to detect any obvious distinct concentration
# Boolean flags marking listings that clear ~95% of the superhost average
# on each metric; these drive the colour channel of the maps below.
df3_response['response_superhost'] = (
    (df3_response['host_response_rate'] > 0.608)
    & (df3_response['host_acceptance_rate'] > 0.798)
)
# Review-score cutoffs, applied uniformly: <column> -> threshold.
_review_cutoffs = {
    'review_scores_rating': 9.196,
    'review_scores_cleanliness': 9.139,
    'review_scores_value': 9.11,
    'review_scores_communication': 9.45,
    'review_scores_checkin': 9.43,
}
for _col, _cut in _review_cutoffs.items():
    df3_review[_col + '_superhost'] = df3_review[_col] > _cut
df3['amenities_count_superhost'] = df3['amenities_count'] > 23.7
#By combination of host response rate and host acceptance rate
#Lower east side, Chinatown, Little Italy, Financial District, Nolita
# BUG FIX: the Mapbox token must be registered via a function CALL.
# The original `px.set_mapbox_access_token = '...'` assigned the string to
# the attribute, silently replacing the function and never storing the token.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Map every listing; colour marks listings clearing BOTH the response-rate
# and acceptance-rate superhost thresholds.
# NOTE(review): the color column is boolean (discrete), so
# color_continuous_scale is likely ignored by plotly — confirm intent.
fig2 = px.scatter_mapbox(
    data_frame=df3_response,
    lat="latitude",
    lon="longitude",
    color="response_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig2.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By review scores rating
#Lower east side, Chinatown, Little Italy, Financial District, Nolita, Chelsea, Washington Heights
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the overall-rating superhost threshold (9.196).
fig3 = px.scatter_mapbox(
    data_frame=df3_review,
    lat="latitude",
    lon="longitude",
    color="review_scores_rating_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig3.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By review scores cleanliness
#No obvious pattern, overall is good, those beforementioned districts are included
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the cleanliness superhost threshold (9.139).
fig4 = px.scatter_mapbox(
    data_frame=df3_review,
    lat="latitude",
    lon="longitude",
    color="review_scores_cleanliness_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig4.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By review scores value
#No obvious pattern, overall is good, those beforementioned districts are included
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the value superhost threshold (9.11).
fig5 = px.scatter_mapbox(
    data_frame=df3_review,
    lat="latitude",
    lon="longitude",
    color="review_scores_value_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig5.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By review scores communication, a mirror indicator to the satisfaction on response and acceptance rate
#Finance district, upper east side, kinda contradict to the response rate, probably not all feedback satisfies client
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the communication superhost threshold (9.45).
fig6 = px.scatter_mapbox(
    data_frame=df3_review,
    lat="latitude",
    lon="longitude",
    color="review_scores_communication_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig6.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By review scores checkin
# Those with river views and park view
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the check-in superhost threshold (9.43).
fig7 = px.scatter_mapbox(
    data_frame=df3_review,
    lat="latitude",
    lon="longitude",
    color="review_scores_checkin_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig7.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -74.00},
)
#By Amenities_count=> very few apartments fulfill the need, not a key factor to promote.
#more concentrated in lower east side and hell's kitchen
# BUG FIX: register the Mapbox token with a function call — the original
# `px.set_mapbox_access_token = '...'` overwrote the function with a string.
px.set_mapbox_access_token('pk.eyJ1IjoiYWxhbnR6dGFuIiwiYSI6ImNqa2VqeHFnejB1dTEzcHBxczBhaGZhZ2gifQ.moVxmxTkE53tZDfBFr3uYA')
# Colour marks listings above the amenities-count superhost threshold (23.7).
fig8 = px.scatter_mapbox(
    data_frame=df3,
    lat="latitude",
    lon="longitude",
    color="amenities_count_superhost",
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_name="neighbourhood_cleansed",
)
fig8.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -73.98},
)